# Package setup -----------------------------------------------------------
# Pin a CRAN mirror so any installs in this script resolve consistently.
options(repos = c(CRAN = "https://cran.rstudio.com/"))

library(tidyverse)   # dplyr, ggplot2, purrr, readr, tidyr, stringr, ...
library(tidymodels)  # recipes, parsnip, workflows, rsample, yardstick, ...
library(powerjoin)   # power_full_join() for joining a list of tables
library(glue)        # string interpolation
library(vip)         # variable-importance plots (masks utils::vi)
library(baguette)

# Data acquisition ---------------------------------------------------------
# Base URL for the CAMELS dataset hosted on NCAR GDEX.
root <- 'https://gdex.ucar.edu/dataset/camels/file'

# PDF documenting every attribute column in the dataset.
download.file(glue('{root}/camels_attributes_v2.0.pdf'),
              'camels_attributes_v2.0.pdf')

# Attribute tables to fetch (climate, geology, soil, topography,
# vegetation, hydrology).
types <- c("clim", "geol", "soil", "topo", "vege", "hydro")

remote_files <- glue('{root}/camels_{types}.txt')
# BUG FIX: the destination must be a *local* file name, not the remote URL.
# The original assigned the URL template to local_files as well, so every
# download.file() call failed with
# "cannot open destfile 'https://...' , reason 'No such file or directory'".
local_files <- glue('camels_{types}.txt')

# Download each attribute table to the working directory.
walk2(remote_files, local_files, download.file, quiet = TRUE)

# Read all six tables and merge them on the shared gauge identifier.
camels <- map(local_files, read_delim, show_col_types = FALSE)
camels <- power_full_join(camels, by = 'gauge_id')

## Question 1
# zero_q_freq represents the frequency of days when flow equals 0 mm/day.
# It is used to determine the number of days when there is no measurable
# flow in a river/stream of interest.

# CONUS map of gauge locations, colored by mean daily flow (q_mean).
ggplot(data = camels, aes(x = gauge_lon, y = gauge_lat)) +
  borders("state", colour = "gray50") +
  geom_point(aes(color = q_mean)) +
  scale_color_gradient(low = "pink", high = "dodgerblue") +
  ggthemes::theme_map()

# NOTE(review): the original appended a bare
#   scale_color_manual(values = c("dodgerblue", "yellow", "pink"))
# after the plot. scale_color_manual() is a *discrete* scale and cannot be
# combined with the continuous q_mean mapping above (adding it would
# error); evaluated on its own it merely printed the ggproto object to the
# console. Removed — manual palettes only apply when coloring by a
# discrete variable.
## Question 2
library(ggplot2)
library(ggthemes)
library(ggpubr)

# Helper: CONUS gauge map colored by one continuous attribute.
plot_gauge_map <- function(col, low, high, title, legend_name) {
  ggplot(data = camels, aes(x = gauge_lon, y = gauge_lat)) +
    borders("state", colour = "gray50") +
    geom_point(aes(color = {{ col }})) +
    scale_color_gradient(low = low, high = high) +
    ggthemes::theme_map() +
    labs(title = title, color = legend_name) +
    theme(legend.position = "right")
}

map_aridity <- plot_gauge_map(aridity, "orange", "darkblue",
                              "Map of Aridity", "Aridity")
map_p_mean <- plot_gauge_map(p_mean, "yellow", "darkblue",
                             "Map of Precipitation (p_mean)", "Precipitation")

# Stack the two maps vertically in one figure.
combined_map <- ggarrange(map_aridity, map_p_mean, ncol = 1, nrow = 2)

print(map_aridity)
print(map_p_mean)
print(combined_map)

# Pairwise correlations among aridity, precipitation, and mean flow.
camels |>
  select(aridity, p_mean, q_mean) |>
  drop_na() |>
  cor()
# Aridity vs precipitation, colored by mean flow ---------------------------
# (FIX: title typo "Runnoff" corrected to "Runoff" in all three plots.)

# Raw scale: the relationship is nonlinear and heteroscedastic.
ggplot(camels, aes(x = aridity, y = p_mean)) +
  geom_point(aes(color = q_mean)) +
  geom_smooth(method = "lm", color = "red", linetype = 2) +
  scale_color_viridis_c() +
  theme_linedraw() +
  theme(legend.position = "bottom") +
  labs(title = "Aridity vs Rainfall vs Runoff",
       x = "Aridity",
       y = "Rainfall",
       color = "Mean Flow")

# Log-log axes: the relationship becomes approximately linear.
ggplot(camels, aes(x = aridity, y = p_mean)) +
  geom_point(aes(color = q_mean)) +
  geom_smooth(method = "lm") +
  scale_color_viridis_c() +
  scale_x_log10() +
  scale_y_log10() +
  theme_linedraw() +
  theme(legend.position = "bottom") +
  labs(title = "Aridity vs Rainfall vs Runoff",
       x = "Aridity",
       y = "Rainfall",
       color = "Mean Flow")

# Same plot with the color scale log-transformed as well, and a wider
# legend key so the gradient is readable.
ggplot(camels, aes(x = aridity, y = p_mean)) +
  geom_point(aes(color = q_mean)) +
  geom_smooth(method = "lm") +
  scale_color_viridis_c(trans = "log") +
  scale_x_log10() +
  scale_y_log10() +
  theme_linedraw() +
  theme(legend.position = "bottom",
        legend.key.width = unit(2.5, "cm"),
        legend.key.height = unit(.5, "cm")) +
  labs(title = "Aridity vs Rainfall vs Runoff",
       x = "Aridity",
       y = "Rainfall",
       color = "Mean Flow")
# Train/test split, CV folds, and baseline model ---------------------------
set.seed(123)

# Model the log of mean flow (flow is strongly right-skewed).
camels <- camels |>
  mutate(logQmean = log(q_mean))

# 80/20 split plus 10-fold cross-validation on the training set.
camels_split <- initial_split(camels, prop = 0.8)
camels_train <- training(camels_split)
camels_test  <- testing(camels_split)
camels_cv    <- vfold_cv(camels_train, v = 10)

# Preprocessing: log both predictors, add their interaction, drop NAs.
rec <- recipe(logQmean ~ aridity + p_mean, data = camels_train) |>
  step_log(all_predictors()) |>
  step_interact(terms = ~ aridity:p_mean) |>
  step_naomit(all_predictors(), all_outcomes())

# Materialize the preprocessed training data.
baked_data <- prep(rec, camels_train) |>
  bake(new_data = NULL)

# Baseline interaction model fit directly with lm().
lm_base <- lm(logQmean ~ aridity * p_mean, data = baked_data)
summary(lm_base)
# Refit using the recipe-generated interaction column; the coefficients
# match lm_base exactly because aridity_x_p_mean is the same product term.
summary(lm(logQmean ~ aridity + p_mean + aridity_x_p_mean, data = baked_data))

# Apply the trained recipe to the held-out test set, then predict with
# the hand-fit baseline model.
test_data <- bake(prep(rec), new_data = camels_test)
test_data$lm_pred <- predict(lm_base, newdata = test_data)

# Test-set performance: RMSE, R-squared, and MAE.
metrics(test_data, truth = logQmean, estimate = lm_pred)
# Observed vs predicted for the baseline lm, colored by aridity.
ggplot(test_data, aes(x = logQmean, y = lm_pred, colour = aridity)) +
  scale_color_gradient2(low = "brown", mid = "orange", high = "darkgreen") +
  geom_point() +
  geom_abline(linetype = 2) +
  theme_linedraw() +
  labs(title = "Linear Model: Observed vs Predicted",
       x = "Observed Log Mean Flow",
       y = "Predicted Log Mean Flow",
       color = "Aridity")

# Same linear model expressed as a tidymodels workflow.
lm_model <- linear_reg() |>
  set_engine("lm") |>
  set_mode("regression")

lm_wf <- workflow() |>
  add_recipe(rec) |>
  add_model(lm_model) |>
  fit(data = camels_train)

# Sanity check: workflow coefficients agree with the hand-fit lm_base.
summary(extract_fit_engine(lm_wf))$coefficients
summary(lm_base)$coefficients

# Predict on the raw test set — the workflow applies the recipe itself.
lm_data <- augment(lm_wf, new_data = camels_test)
dim(lm_data)

metrics(lm_data, truth = logQmean, estimate = .pred)
# Observed vs predicted scatter for the workflow lm.
ggplot(lm_data, aes(x = logQmean, y = .pred, colour = aridity)) +
  scale_color_viridis_c() +
  geom_point() +
  geom_abline() +
  theme_linedraw()

# FIX: only install ranger when it is missing — an unconditional
# install.packages() re-downloads the package on every run of the script.
if (!requireNamespace("ranger", quietly = TRUE)) {
  install.packages("ranger")
}

library(baguette)
library(tidymodels)

# Random forest with impurity-based variable importance.
rf_model <- rand_forest() |>
  set_engine("ranger", importance = "impurity") |>
  set_mode("regression")

rf_wf <- workflow() |>
  add_recipe(rec) |>
  add_model(rf_model)

rf_fit <- fit(rf_wf, data = camels_train)
print(rf_fit)

# Evaluate on the raw test set (the workflow handles preprocessing).
rf_data <- augment(rf_fit, new_data = camels_test)
dim(rf_data)

metrics(rf_data, truth = logQmean, estimate = .pred)

# Observed vs predicted for the random forest.
ggplot(rf_data, aes(x = logQmean, y = .pred, colour = aridity)) +
  scale_color_viridis_c() +
  geom_point() +
  geom_abline() +
  theme_linedraw()

# Compare lm vs rf with the same recipe under 10-fold CV.
wf <- workflow_set(list(rec), list(lm_model, rf_model)) |>
  workflow_map('fit_resamples', resamples = camels_cv)

autoplot(wf)
rank_results(wf, rank_metric = "rsq", select_best = TRUE)
## Question 3
# FIX: guard the install — an unconditional install.packages() in a script
# re-downloads xgboost on every run.
if (!requireNamespace("xgboost", quietly = TRUE)) {
  install.packages("xgboost")
}

library(tidymodels)
library(baguette)
library(tidyr)

# Gradient-boosted trees.
xgboost_model <- boost_tree() |>
  set_engine("xgboost") |>
  set_mode("regression")

# Bagged multilayer perceptron (neural network).
nn_model <- bag_mlp() |>
  set_engine("nnet") |>
  set_mode("regression")

# Compare all four models with the same recipe under 10-fold CV.
wf_set <- workflow_set(
  list(rec),
  list(lm_model, rf_model, xgboost_model, nn_model)
)

wf_set_results <- wf_set |>
  workflow_map("fit_resamples", resamples = camels_cv)

autoplot(wf_set_results)
rank_results(wf_set_results, rank_metric = "rsq", select_best = TRUE)
# I would move forward with the Bagged MLP model because it is the best
# model according to both RMSE and R-squared values. It provides the most
# accurate predictions out of the four models, but Linear Regression or
# Random Forest could be used as well, falling just short of Bagged MLP.

## Build Your Own
# FIX: guard the install — rsample already ships with tidymodels, so an
# unconditional install.packages() here is redundant on every run.
if (!requireNamespace("rsample", quietly = TRUE)) {
  install.packages("rsample")
}
library(rsample)

# Fresh 75/25 split and 10-fold CV for the build-your-own exercise.
set.seed(123)
camels_split <- initial_split(camels, prop = 0.75)
camels_train <- training(camels_split)
camels_test  <- testing(camels_split)
camels_cv    <- vfold_cv(camels_train, v = 10)

# NOTE(review): this formula is defined but never used — the recipe below
# models only aridity + p_mean. Also `vege`, `topo`, and `geol` match the
# attribute *file* suffixes, not obvious column names; confirm these
# columns exist in `camels` before building a recipe from this formula.
formula <- logQmean ~ aridity + p_mean + vege + topo + geol

library(tidyverse)
library(recipes)

# Same preprocessing as before: log predictors, interaction, drop NAs.
rec <- recipe(logQmean ~ aridity + p_mean, data = camels_train) |>
  step_log(all_predictors()) |>
  step_interact(terms = ~ aridity:p_mean) |>
  step_naomit(all_predictors(), all_outcomes())

library(tidymodels)

rf_model <- rand_forest() |>
  set_engine("ranger", importance = "impurity") |>
  set_mode("regression")

lm_model <- linear_reg() |>
  set_engine("lm") |>
  set_mode("regression")

xgboost_model <- boost_tree() |>
  set_engine("xgboost") |>
  set_mode("regression")

# Three-way comparison under 10-fold CV.
wf_set <- workflow_set(
  list(rec),
  list(lm_model, rf_model, xgboost_model)
)

wf_set_results <- wf_set |>
  workflow_map("fit_resamples", resamples = camels_cv)

autoplot(wf_set_results)
rank_results(wf_set_results, rank_metric = "rsq", select_best = TRUE)
# Final model: random forest ranked best in the CV comparison above.
best_model <- rf_model

best_wf <- workflow() |>
  add_recipe(rec) |>
  add_model(best_model) |>
  fit(data = camels_train)

# BUG FIX: a fitted workflow applies its recipe to `new_data` itself.
# The original baked camels_test with prep(rec)/bake() first and then
# called predict() on the already-baked frame, so step_log() ran twice —
# taking log of already-logged (sometimes negative) values. That is the
# source of the "NaNs produced" warning and the badly degraded test
# metrics (RMSE 0.793 vs ~0.55 in CV). Predict on the RAW test data.
test_data <- augment(best_wf, new_data = camels_test) |>
  mutate(rf_pred = .pred)

metrics(test_data, truth = logQmean, estimate = rf_pred)

ggplot(test_data, aes(x = logQmean, y = rf_pred, colour = aridity)) +
  scale_color_viridis_c() +
  geom_point() +
  geom_abline() +
  theme_linedraw() +
  labs(title = "Random Forest: Observed vs Predicted",
       x = "Observed Log Mean Flow",
       y = "Predicted Log Mean Flow",
       color = "Aridity")

library(tidymodels)

# FIX: the original refit an identical workflow here. Reuse the fit above —
# ranger draws a fresh internal seed on each fit, so refitting would also
# change predictions slightly for no benefit.
final_model <- best_wf

final_predictions <- augment(final_model, new_data = camels_test)

ggplot(final_predictions, aes(x = logQmean, y = .pred)) +
  geom_point() +
  geom_abline(linetype = 2) +
  theme_minimal() +
  labs(title = "Observed vs Predicted Log Mean Flow",
       x = "Observed",
       y = "Predicted")